…
## ==========================================================================
## Installing Libraries
## ==========================================================================
## Install Several Required packages:
install.packages(c("devtools", "gplots",
"ggplot2", "igraph",
"lattice", "knitr","rsample",
"RColorBrewer", "rmarkdown",
"stringr", "UpSetR", "vcfR",
"pcaPP", "SciViews", "tidyverse"))
## ---------------------------------------------------------------------------
## TCGAbiolinks package (from github):
devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
## ---------------------------------------------------------------------------
## Bioconductor packages: (R >= 3.5)
if(!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("IRanges", "GenomicRanges", "GenomicFeatures",
"org.Hs.eg.db", "Rsamtools",
"SummarizedExperiment",
"TxDb.Hsapiens.UCSC.hg19.knownGene",
"TxDb.Hsapiens.UCSC.hg38.knownGene",
"VariantAnnotation"))
## ---------------------------------------------------------------------------
## Install ISMA - For Integrative retrieval
## and analysis of Mutations
install.packages("isma/isma_0.1.4.tar.gz", repos=NULL)
## ---------------------------------------------------------------------------
## Install mND - For Multi Network Diffusion
install.packages("mND/mND_0.1.7.tar.gz", repos = NULL)
## ---------------------------------------------------------------------------
## Get GeneSurrounder.R and calc_p.R from github and Source it
source("genesurrounder/GeneSurrounder.R")
source("genesurrounder/run_geneSurrounder.R")
source("mND/calc_p.R")
## ==========================================================================
## Loading Libraries
## ==========================================================================
library(isma)
library(mND)
library(limma)
library(igraph)
library(pcaPP)
library(SciViews)
library(tidyverse)
library(lubridate)
## ==========================================================================
## First we Try mND Only - Without GeneSurrounder
## ==========================================================================
## Normalize the Adjacency Matrix
W <- normalize_adj_mat(A)
## Permute the Layers Matrix
X0_perm <- perm_X0(X0, r = 50, W, seed_n = 2)
## Perform Network Diffusion - Non-Windows
#Xs <- ND(X0_perm, W, cores = 2)
## Perform Network Diffusion - Windows
Xs <- ND(X0_perm, W)
saveRDS(Xs, "Data/Xs.rds")
## Load Data without Running above Code
data(Xs)
## Get Indices of Adjacent Neighbours
ind_adj <- neighbour_index(W)
## Perform mND considering sets of 3-neighbors - Non-Windows
mND_score <- mND(Xs, ind_adj, k = 3, cores = 2)
## Perform mND considering sets of 3-neighbors - Windows
mND_score <- mND(Xs, ind_adj, k = 3)
mND_score <- signif_assess(mND_score)
saveRDS(mND_score, "Data/mND_scores.rds")
Strategy:
- Run GeneSurrounder to obtain p-decay
- Use p-decay to adjust the Differential Expression Values
\[Adjusted_{DE} = -log_{10}(p_{decay}) * Original_{DE}\]
- Replace DE Layer with Adjusted DE
- Run mND on the new Data with Adjusted Layer
- Save the Results & Compare with Previous Protocol
## Load in the Data
data(X0)
data(A)
## Get a Correlation Matrix (Required to run GS)
ge <- X0[,2]
ge_no_0 <- data.frame(ge[ge > 0])
ge_cor <- calcCorMatrix(ge_no_0, corMethod = "pearson", exprName = "ge_no_0",
useMethod = "everything")
## Get a resampled GE matrix (Required to run GS)
ge_resampled <- data.frame(matrix(ncol = length(ge[ge > 0]), nrow = 1000))
colnames(ge_resampled) <- names(ge[ge > 0])
set.seed(123)
for(i in 1:1000){
ge_resampled[i,] <- sample(ge[ge > 0], length(ge[ge > 0]), replace = TRUE)
}
## Get the interaction network & distance matrix (Required to run GS)
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all",
networkName = "int_network")
#> Run Gene Surrounder
#> Takes as parameters:
#> 1. Genes distance matrix
#> 2. GE correlation matrix
#> 3. GE layer with ge > 0
#> 4. Resampled GE matrix
#> 5. Gene IDs of > 0 GE
#> 6. Output file name
#> NOTE: Suggested to run in CHUNKS - Takes too long!
gs_results_all <- run_geneSurrounder(distance.matrix = ge_dist,
cor.matrix = ge_cor,
geneStats.observed = ge[ge > 0],
perm.geneStats.matrix = as.matrix(ge_resampled),
diameter = diam,
num.Sphere.resamples = 1000,
gene.id = names(ge[ge > 0]),
decay_only = TRUE,
file_name = "gs_results_all.csv"
)
## Save Results to CSV
write.csv(gs_results_all, "Data/gs_results_all.csv")
##---------------------------------------------------------------##
## Step 2: mND on GeneSurrounder - Adjusted scores ##
##---------------------------------------------------------------##
## 1. Read the results of Gene Surrounder
gs_results_all <- read.csv("Data/gs_results_all.csv") %>% mutate(time = duration(time)) %>% select(-X)
rownames(gs_results_all) <- gs_results_all$gene.id
## 2. Adjust the Scores of one Layer
## Adjusted score = -log10(p.Decay)
X0_gs_adjusted <- data.frame(X0) %>%
rownames_to_column('rn') %>%
mutate(L2 = if_else(rn %in% gs_results_all$gene.id, -log10(gs_results_all[rn,]$p.Decay)*L2, L2)) %>%
column_to_rownames('rn')
X0_gs_adjusted <- as.matrix(X0_gs_adjusted)
## Save Results to RDS
saveRDS(X0_gs_adjusted, "Data/X0_gs_adjusted.rds")
## Load the Results
data(X0)
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
## 3. Visualize the difference between
## the two layers: old & adjusted
## Plot Results with Density plot
ggplot(data.frame(X0), aes(L2)) + geom_density()
ggplot(data.frame(X0_gs_adjusted), aes(L2)) + geom_density()
## 4. Run mND on Adjusted Values
## take different k-values to run mND
## Load Adjacency Matrix
data(A)
## Normalize
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)
## Permute Adjusted Matrix
X0_perm_new <- perm_X0(X0_gs_adjusted, r = 50, W, seed_n = 2)
## ND on new Adjusted Matrix
Xs_new <- ND(X0_perm_new, W, cores = 2)
## mND score on Adjusted Matrix
mND_score_new <- mND(Xs_new, ind_adj, k=3, cores = 2)
mND_score_new_k2 <- mND(Xs_new, ind_adj, k=2, cores = 2)
## mND p-value Significance Assessment
mND_score_new <- signif_assess(mND_score_new)
mND_score_new_k2 <- signif_assess(mND_score_new_k2)
## 5. Save Results in RDS
saveRDS(Xs_new, "Data/Xs_new.rds")
saveRDS(mND_score_new, "Data/mND_gs_adjusted_scores.rds")
saveRDS(mND_score_new_k2, "Data/mND_gs_adjusted_scores_k2.rds")
## Load Data
## Adjacency Matrix
data(A)
## Normalized AM
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)
## Matrices with Original Values
data(X0)
data(Xs)
## Matrices with Adjusted Values
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
Xs_new <- readRDS("Data/Xs_new.rds")
## mND Scores Results
mND_score <- readRDS("Data/mND_scores.rds")
mND_score_new <- readRDS("Data/mND_gs_adjusted_scores.rds")
mND_score_new_k2 <- readRDS("Data/mND_gs_adjusted_scores_k2.rds")
## mND Alone Classification ##
##---------------------------------------##
# H1: genes with a mutation frequency greater than zero;
# H2: top 1200 differentially expressed genes (FDR < 10^-7).
# Further, we set the cardinalities of gene sets N1 and N2,
# containing the genes with the highest scoring neighborhoods,
# as |H1|=|N1| and |H2|=|N2|
Hl <- list(l1 = rownames(X0[X0[,1]>0,]),
l2 = names(X0[order(X0[,2], decreasing = T),2][1:1200])
)
top_Nl <- unlist(lapply(Hl, function(x) length(x)))
top_Nl
## l1 l2
## 1238 1200
class_res <- classification(mND_score, X0, Hl, top = top_Nl)
## Classification of genes in every layer
head(class_res$gene_class)
## L1 L2
## TP53 I L
## TRIM28 M NS
## CCNB1 M M
## RPS27A M L
## UBA52 M L
## CCNA2 L M
## Occurrence of (M; L; I; NS)
## for each gene across layers
head(class_res$occ_labels)
## I L M NS
## TP53 0.5 0.5 0.0 0.0
## TRIM28 0.0 0.0 0.5 0.5
## CCNB1 0.0 0.0 1.0 0.0
## RPS27A 0.0 0.5 0.5 0.0
## UBA52 0.0 0.5 0.5 0.0
## CCNA2 0.0 0.5 0.5 0.0
## Plot the results of mND Score
plot_results(mND_score, class_res, W, n = 150, directory = "Results/mND_results/")
## png
## 2
## Optimizing k (Mac only)
k_val <- seq(1,6,1)
k_results <- optimize_k(Xs, X0, k_val, ind_adj, W, top = 200, cores = 2)
k_results <- data.frame(k_results)
colnames(k_results) <- k_val
## Save Results to CSV
## can also be loaded through data(k_results) but is a list not data.frame
write.csv(k_results, "Data/k_results.csv")
## New Results -- GS Adjusted mND Results ##
## -------------------------------------- ##
#H1: genes with a mutation frequency greater than zero;
#H2: top 1200 differentially expressed genes (FDR < 10^-7).
#Further, we set the cardinalities of gene sets N1 and N2,
# containing the genes with the highest scoring neighborhoods,
# as |H1|=|N1| and |H2|=|N2|
Hl_new <- list(l1 = rownames(X0[X0[,1]>0,]),
l2 = names(X0_gs_adjusted[order(X0_gs_adjusted[,2], decreasing = T),2][1:1200])
)
top_Nl_new <- unlist(lapply(Hl_new, function(x) length(x)))
top_Nl_new
## l1 l2
## 1238 1200
class_res_new <- classification(mND_score_new, X0_gs_adjusted, Hl_new, top = top_Nl_new)
## Classification of genes in every layer
head(class_res_new$gene_class)
## L1 L2
## TP53 I L
## TRIM28 M NS
## CCNB1 M M
## CCNA2 L M
## AURKB L M
## TPX2 L M
## Occurrence of (M; L; I; NS) for each gene across layers
head(class_res_new$occ_labels)
## I L M NS
## TP53 0.5 0.5 0.0 0.0
## TRIM28 0.0 0.0 0.5 0.5
## CCNB1 0.0 0.0 1.0 0.0
## CCNA2 0.0 0.5 0.5 0.0
## AURKB 0.0 0.5 0.5 0.0
## TPX2 0.0 0.5 0.5 0.0
## Plotting new mND Scores Results
plot_results(mND_score_new, class_res_new, W, n = 150, directory = "Results/mND_results_new/")
## png
## 2
plot_results(mND_score_new, class_res_new, W, n = 150, directory = "Results/mND_results_new_k2/")
## png
## 2
## Optimizing k (Mac only)
k_results_new <- optimize_k(Xs_new, X0_gs_adjusted, k_val, ind_adj, W, top = 200, cores = 2)
k_results_new <- data.frame(k_results_new)
colnames(k_results_new) <- k_val
## Saving Results - Optional
## k = 2 seems like a better choice in this case
## write.csv(k_results_new, "Data/k_results_new.csv")
## Sanity check:
## Get new Classes - new Results
class_res_new$gene_class <- class_res_new$gene_class[match(rownames(class_res$gene_class), rownames(class_res_new$gene_class)),]
#> Results for Somatic Mutations Layer
sum(class_res_new$gene_class[,1] != class_res$gene_class[,1])
## [1] 0
shift_sm <- table(mND = class_res$gene_class[,1], GS_adjusted_mND = class_res_new$gene_class[,1])
#> Results for Gene Expression Adjusted Layer
sum(class_res_new$gene_class[,2] != class_res$gene_class[,2])
## [1] 888
shift_ge <- table(mND = class_res$gene_class[,2], GS_adjusted_mND = class_res_new$gene_class[,2])
## Confusion Matrix Percentages over original mND genes
results_ge_mND <- shift_ge
for(i in 1:nrow(shift_ge)){
results_ge_mND[i,] <- (shift_ge[i,]/sum(shift_ge[i,]))*100
}
results_ge_mND
## GS_adjusted_mND
## mND I L M NS
## I 76.70639220 0.00000000 3.57529794 19.71830986
## L 0.27347311 69.73564266 2.55241568 27.43846855
## M 4.69314079 1.80505415 88.44765343 5.05415162
## NS 1.70544268 1.46331193 0.08421939 96.74702600
## Total selected mND genes (I, L, M)
11796 - sum(shift_ge[4,]) #2297 k = 3
## [1] 2297
## Percentages over GS_adjusted genes
results_ge_GS_adjusted <- shift_ge
for(i in 1:ncol(shift_ge)){
results_ge_GS_adjusted[,i] <- (shift_ge[,i]/sum(shift_ge[,i]))*100
}
results_ge_GS_adjusted
## GS_adjusted_mND
## mND I L M NS
## I 79.9097065 0.0000000 10.5095541 1.8788066
## L 0.3386005 84.1584158 8.9171975 3.1072571
## M 1.4672686 0.5500550 78.0254777 0.1445236
## NS 18.2844244 15.2915292 2.5477707 94.8694126
## Total selected GS-adjusted mND genes (I, L, M)
#> 2109 k = 3
11796 - sum(shift_ge[,4])
## [1] 2109
#> Difference btween k=3 and k=2
#> -188 genes k = 3, -205 k = 2
11796 - sum(shift_ge[,4]) - (11796 - sum(shift_ge[4,]))
## [1] -188
## Difference in M
## GS-adjusted (314 with k = 3)
## --> +37 modules
sum(shift_ge[,3])
## [1] 314
## Percent over all genes
(shift_ge/11796)*100
## GS_adjusted_mND
## mND I L M NS
## I 6.00203459 0.00000000 0.27975585 1.54289590
## L 0.02543235 6.48524924 0.23736860 2.55171244
## M 0.11020685 0.04238725 2.07697525 0.11868430
## NS 1.37334690 1.17836555 0.06781960 77.90776534
library(visNetwork)
library(RCy3)
ls("package:igraph", pattern = "^layout_.")
## mND graph
genes_subset_mND <- class_res$gene_class %>% filter(L2 != "NS")
A_subset_mND <- A[rownames(A) %in% rownames(genes_subset_mND),
colnames(A) %in% rownames(genes_subset_mND)]
graph_mND <- graph_from_adjacency_matrix(A_subset_mND) %>%
set_vertex_attr("class", value = genes_subset_mND[,2])
vis_mND <- toVisNetworkData(graph_mND)
visNetwork(nodes = vis_mND$nodes, edges = vis_mND$edges) %>%
visIgraphLayout(layout = "layout_") %>%
visOptions(nodesIdSelection = TRUE, selectedBy = "class") %>%
visNodes(color = vis_mND$nodes$class)
#> Trying Cytoscape: Took a loong time
# cytoscapePing()
# createNetworkFromIgraph(graph_mND,new.title='mND')
## 1 = Isolated, 2 = Linker, 3 = Module
## GS-adjusted mND graph
genes_subset_adjusted_mND <- class_res_new$gene_class %>% filter(L2 != "NS")
A_subset_adjusted_mND <- A[rownames(A) %in% rownames(genes_subset_adjusted_mND),
colnames(A) %in% rownames(genes_subset_adjusted_mND)]
graph_adjusted_mND <- graph_from_adjacency_matrix(A_subset_adjusted_mND) %>%
set_vertex_attr("class", value = genes_subset_adjusted_mND[,2])
vis_adjusted_mND <- toVisNetworkData(graph_adjusted_mND)
visNetwork(nodes = vis_adjusted_mND$nodes, edges = vis_adjusted_mND$edges) %>%
visIgraphLayout(layout = "layout_in_circle") %>%
visOptions(nodesIdSelection = TRUE, selectedBy = "class")
## ==========================================================================
## Analysis of Results - Bilal
## A. Checking Old vs New Genes Count
## B. Introducing Cumulative Decay Score
## ==========================================================================
## A. Checking Old vs New Genes Count
## ----------------------------------
## Load Data
#> Adjacency Matrix
data(A)
## Normalized AM
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)
## Matrices with Original Values
data(X0)
data(Xs)
## Matrices with Adjusted Values
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
Xs_new <- readRDS("Data/Xs_new.rds")
## mND Scores Results
mND_score <- readRDS("Data/mND_scores.rds")
mND_score_new_k2 <- readRDS("Data/mND_gs_adjusted_scores_k2.rds")
mND_score_new <- readRDS("Data/mND_gs_adjusted_scores.rds")
## Get new and old genes
new_genes = rownames(mND_score_new$mND)[which(mND_score_new$mND$mNDp < 0.05)]
length(new_genes)
## [1] 6313
old_genes = rownames(mND_score$mND)[which(mND_score$mND$mNDp < 0.05)]
length(old_genes)
## [1] 5777
## Ratio - we have a 7% change in the genes
## Ghadi: " 7.7% of the total genes were additionally
## significant with a 0.05 mNDp threshold "
sum(!(new_genes %in% old_genes))/nrow(mND_score$mND)
## [1] 0.07782299
# Ratio would be length(new_genes)/length(old_genes)
# (diff lengths means hard to get a single percentage to reflect change)
(length(new_genes)-length(old_genes))/length(new_genes)
## [1] 0.08490417
After obtaining the classification of each gene
we wish to check the cumulative_decay_score on target genes
Target Genes are new genes that are obtained
Cumulative Decay Score = GE_neighbor/d_neighbor_target
Stategy:
For ever node in the graph:
….for all modules in the list of old modules:
….distance = get_distance (old_module, node)
….score[node] += Measure(old_module)/distance
————————————————
Now, this Measure could be taken as GE or the adjusted -log_10(p-value) ————————————————-
## Introducing Cumulative Decay Score
## -------------------------------------
## Load Classificaiton of Old vs New Genes
data(X0)
Hl_old <- list(l1 = rownames(X0[X0[,1]>0,]),
l2 = names(X0[order(X0[,2], decreasing = T),2][1:1200])
)
top_Nl_old <- unlist(lapply(Hl_old, function(x) length(x)))
top_Nl_old
## l1 l2
## 1238 1200
class_res_old <- classification(mND_score, X0, Hl_old, top = top_Nl_old)
head(class_res_old$gene_class)
## L1 L2
## TP53 I L
## TRIM28 M NS
## CCNB1 M M
## RPS27A M L
## UBA52 M L
## CCNA2 L M
Hl_new <- list(l1 = rownames(X0_gs_adjusted[X0_gs_adjusted[,1]>0,]),
l2 = names(X0_gs_adjusted[order(X0_gs_adjusted[,2], decreasing = T),2][1:1200])
)
top_Nl_new <- unlist(lapply(Hl_new, function(x) length(x)))
top_Nl_new
## l1 l2
## 1238 1200
class_res_new <- classification(mND_score_new, X0_gs_adjusted, Hl_new, top = top_Nl_new)
#class_res_new <- classification(mND_score_new_k2, X0_gs_adjusted, Hl_new, top = top_Nl_new)
head(class_res_new$gene_class)
## L1 L2
## TP53 I L
## TRIM28 M NS
## CCNB1 M M
## CCNA2 L M
## AURKB L M
## TPX2 L M
## After obtaining the classification of each gene
## we wish to check the cumulative_decay_score on target genes
## Target Genes are new genes that are obtained
## Cumulative Decay Score = GE_neighbor/d_neighbor_target
## Stategy:
## For ever node in the graph:
## for all modules in the list of old modules:
## distance = get_distance (old_module, node)
## score[node] += Measure(old_module)/distance
##------------------------------------------------
## Now, this Measure could be taken as GE
## or the adjusted -log_10(p-value)
##-------------------------------------------------
## 1. First lets convert the Adjacency Matrix to Distance Matrix
## install.packages("netmeta")
library(netmeta)
## Loading required package: meta
## Loading 'meta' package (version 5.2-0).
## Type 'help(meta)' for a brief overview.
## Readers of 'Meta-Analysis with R (Use R!)' should install
## older version of 'meta' package: https://tinyurl.com/dt4y5drs
## Loading 'netmeta' package (version 2.1-0).
## Type 'help("netmeta-package")' for a brief overview.
## Readers of 'Meta-Analysis with R (Use R!)' should install
## older version of 'netmeta' package: https://tinyurl.com/kyz6wjbb
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all", networkName = "int_network")
## 2. Second, lets get the decay effect of each gene from gs_results_all
gs_results_all <- read.csv("Data/gs_results_all.csv")
## 3. Get List of all old nodes classification
old_class = class_res_old$gene_class[,2]
## 4. Get list of all new nodes classification
new_class = class_res_new$gene_class[,2]
## 5. Remember - adjusted scores of genes
## X0_gs_adjusted
## 6. initialize empty vector
results_c_decay_score = rep(0,nrow(ge_dist))
new_module = rep(0,nrow(ge_dist))
## 7. Get Cumulative Decay Score for each Gene
for(i in seq(1,nrow(gs_results_all))){
## Gene Attributes
gene = gs_results_all[i,]
g_id = gene$gene.id
g_rd = gene$radius
g_old_class = old_class[g_id]
g_new_class = new_class[g_id]
## Get the measure of the module node
## we choose measyre as -log_10_pvalue
## Other choice is original GE
## measure = X0_gs_adjusted[i,2]
measure <- -log10(gene$p.Decay)
#> print(measure)
## If the gene is an old module
#> then we add its affect to neighboring (d<r) nodes
if(g_old_class == "M"){
## Traverse other genes in the surrounding radius
## and update their cumulative_decay_score
for(j in seq(1, nrow(ge_dist))){
if (j == i){
next
}
## Get distance between the two nodes
g_dist = ge_dist[i,j]
if (g_dist <= g_rd){
## Update cumulative score of gene_j
results_c_decay_score[j] = as.numeric(results_c_decay_score[j]) + measure/g_dist
}
}
}
## If not old module:
#> check if it is a new module,
#> if yes put 1 in the vector
else{
if (g_new_class == "M"){
new_module[i] = 1
}
}
}
## We have 69 new modules - validate this
## Ghadi: 72 with k = 2, 69 with k = 3
sum(new_module==1)
## [1] 69
## How many modules are in new but are not in old
sum(class_res_old$gene_class[,2]=="M")
## [1] 277
sum(class_res_new$gene_class[,2]=="M")
## [1] 314
## Get Module Genes in our method: GS + mND
new_mdg = rownames(class_res_new$gene_class)[class_res_new$gene_class[,2] == "M"]
## Get Module Genes in 1st method: Only mND
old_mdg = rownames(class_res_old$gene_class)[class_res_old$gene_class[,2] == "M"]
#> Count Common Module Genes in our Method
sum(new_mdg %in% old_mdg)
## [1] 245
#> Count Unique Module Genes in both Method
## 32 (k=2) modules discarded by GS
sum(!(old_mdg %in% new_mdg))
## [1] 32
## Ghadi: 72 with k = 2, 69 k = 3
sum(!(new_mdg %in% old_mdg))
## [1] 69
## Get names of unique genes (distinct module genes: mdg)
distinct_mdg = new_mdg[which(!(new_mdg %in% old_mdg))]
discarded_mdg <- new_mdg[which(!(old_mdg %in% new_mdg))]
## Get Indices of unique genes
dmdg_indices = which(rownames(X0_gs_adjusted)%in%distinct_mdg)
discarded_mdg_indices <- which(rownames(X0_gs_adjusted)%in%discarded_mdg)
## Mean Results of Decay Score in 1st method vs our method
mean(results_c_decay_score[dmdg_indices])
## [1] 168.9751
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
## [1] 126.3344
mean(results_c_decay_score[discarded_mdg_indices])
## [1] 180.4112
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
## [1] 126.4374
## Distinct Density Plots
ggplot(as.data.frame(as.matrix(results_c_decay_score[dmdg_indices])), aes(V1)) + geom_density()
ggplot(as.data.frame(as.matrix(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])), aes(V1)) + geom_density()
## Overlaying the Density Plots
#> Distinct
a = data.frame(cumulative_decay_score = results_c_decay_score[dmdg_indices])
b = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
ggplot() +
geom_density(data = a, aes(x = cumulative_decay_score),
fill = "#DAF7A6", color = "black", alpha = 0.7) +
geom_density(data = b, aes(x = y),
fill = "#C70039", color = "black", alpha = 0.7)
#> Discarded
x = data.frame(cumulative_decay_score = results_c_decay_score[discarded_mdg_indices])
y = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
ggplot() +
geom_density(data = x, aes(x = cumulative_decay_score),
fill = "#DAF7A6", color = "black", alpha = 0.7) +
geom_density(data = y, aes(x = y),
fill = "#C70039", color = "black", alpha = 0.7)
## Significance Test between two vectors which are (distinct):
## 1. results_c_decay_score[dmdg_indices]
## 2. results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]
## ----------------------------------------------------------------------------
vector1 <- results_c_decay_score[dmdg_indices]
vector2 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]
## Comparing Variances:
#Using var.test() in order to compare the variances
#Running a two-sided variance test on both vectors with alpha = 5% (0.05) and null hypothesis (H0) -> variances are equal
#ratio (default is 1) is the expected ratio for H0, and the alternative hypothesis (H1) displays the other side (default is two.sided)
var.test(x = vector1, y = vector2, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
##
## F test to compare two variances
##
## data: vector1 and vector2
## F = 0.55437, num df = 68, denom df = 11726, p-value = 0.002163
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4062469 0.8011474
## sample estimates:
## ratio of variances
## 0.5543714
#we obtain a p-value = 0.002163 with a cut-off of 0.05 which is statistically significant
#we obtain a ratio = 0.554 for x/y; This means that the variance of the distinctive genes that were selected as new modules was significantly lower than the old modules for the calculated cumulative decay score
#the 95% confidence interval is [0.406, 0.801]
## Comparing Means:
#Using t.test() in order to compare the means
#mu is either the population's expected mean or the expected difference in the means of the two samples
#mu = 0 as the null hypothesis.
#paired indicates if the data is paired or not
#var.equal indicates if both the samples are treated as equal or not equal w.r.t their variances
#the above var.equal would be based on the var.test done before (in our case = FALSE)
#since our data is unpaired => we perform the following comparison:
t.test(x = vector1, y = vector2, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: vector1 and vector2
## t = 6.6357, df = 69.451, p-value = 5.896e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 29.82285 55.45860
## sample estimates:
## mean of x mean of y
## 168.9751 126.3344
#we obtain a p-value = 5.896e-09 with a cut-off of 0.05 which is statistically significant
#the 95% confidence interval is [29.82, 55.46]
#the means of vector1 and vector2 are 168.97 and 126.33 respectively.
## Next:
## Significance Test between the other two vectors which are (discarded):
## 3. results_c_decay_score[discarded_mdg_indices]
## 4. results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]
## ----------------------------------------------------------------------------
vector3 <- results_c_decay_score[discarded_mdg_indices]
vector4 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]
#Now we perform the exact same steps as before and analyze the output for the discarded genes...
## Comparing Variances:
var.test(x = vector3, y = vector4, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
##
## F test to compare two variances
##
## data: vector3 and vector4
## F = 0.60504, num df = 31, denom df = 11763, p-value = 0.08254
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3885567 1.0699331
## sample estimates:
## ratio of variances
## 0.6050369
#we obtain a p-value = 0.08254 with a cut-off of 0.05 which implies that it is not statistically significant
#we obtain a ratio = 0.605 for x/y; This means that the variance of the discarded genes that were selected as new modules was not significantly lower than the old modules for the calculated cumulative decay score
#the 95% confidence interval is [0.388, 1.070]
## Comparing Means:
t.test(x = vector3, y = vector4, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: vector3 and vector4
## t = 5.4947, df = 31.279, p-value = 5.055e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 33.94715 74.00045
## sample estimates:
## mean of x mean of y
## 180.4112 126.4374
#we obtain a p-value = 5.055e-06 with a cut-off of 0.05 which is statistically significant
#the 95% confidence interval is [33.95, 74.00]
#the means of vector3 and vector4 are 180.41 and 126.44 respectively.
## ----------------------------------------------------------------------------
## Sanity Re-check:
sum(class_res_new$gene_class[,1] != class_res_old$gene_class[,1])
## [1] 3453
shift_sm <- table(mND = class_res_old$gene_class[,1], GS_adjusted_mND = class_res_new$gene_class[,1])
shift_ge <- table(mND = class_res_old$gene_class[,2], GS_adjusted_mND = class_res_new$gene_class[,2])
# Percentages over mND genes
results_ge_mND <- shift_ge
for(i in 1:nrow(shift_ge)){
results_ge_mND[i,] <- (shift_ge[i,]/sum(shift_ge[i,]))*100
}
results_ge_mND
## GS_adjusted_mND
## mND I L M NS
## I 17.2264355 22.3185265 5.3087757 55.1462622
## L 17.3199635 19.9635369 8.5688241 54.1476755
## M 13.3574007 29.9638989 45.8483755 10.8303249
## NS 5.2637120 4.2214970 0.4632067 90.0515844
# Total selected mND genes (I, L, M)
11796 - sum(shift_ge[4,])
## [1] 2297
# Percentages over GS_adjusted genes
results_ge_GS_adjusted <- shift_ge
for(i in 1:ncol(shift_ge)){
results_ge_GS_adjusted[,i] <- (shift_ge[,i]/sum(shift_ge[,i]))*100
}
results_ge_GS_adjusted
## GS_adjusted_mND
## mND I L M NS
## I 17.9458239 22.6622662 15.6050955 5.2544647
## L 21.4446953 24.0924092 29.9363057 6.1319294
## M 4.1760722 9.1309131 40.4458599 0.3096934
## NS 56.4334086 44.1144114 14.0127389 88.3039125
# Total selected GS-adjusted mND genes (I, L, M) --> -205 genes
11796 - sum(shift_ge[,4])
## [1] 2109
11796 - sum(shift_ge[,4]) - (11796 - sum(shift_ge[4,]))
## [1] -188
# Percent over all genes
(shift_ge/11796)*100
## GS_adjusted_mND
## mND I L M NS
## I 1.3479145 1.7463547 0.4153950 4.3150220
## L 1.6107155 1.8565615 0.7968803 5.0356053
## M 0.3136656 0.7036283 1.0766361 0.2543235
## NS 4.2387250 3.3994574 0.3730078 72.5161072
## Doing the opposite - for NS vs I,L
## After obtaining the classification of each gene
## we wish to check the cumulative_decay_score on target genes
## Target Genes are new genes that are obtained
## Cumulative Decay Score = GE_neighbor/d_neighbor_target
## Stategy:
## For ever node in the graph:
## for all modules in the list of old modules:
## distance = get_distance (old_module, node)
## score[node] += Measure(old_module)/distance
##------------------------------------------------
## Now, this Measure could be taken as GE
## or the adjusted -log_10(p-value)
##-------------------------------------------------
## 1. First lets convert the Adjacency Matrix to Distance Matrix
## install.packages("netmeta")
## 2. initialize empty vector
results_c_decay_score = rep(0,nrow(ge_dist))
new_module = rep(0,nrow(ge_dist))
## 3. Get Cumulative Decay Score for each Gene
for(i in seq(1,nrow(gs_results_all))){
## Gene Attributes
gene = gs_results_all[i,]
g_id = gene$gene.id
g_rd = gene$radius
g_old_class = old_class[g_id]
g_new_class = new_class[g_id]
## Get the measure of the module node
## we choose measyre as -log_10_pvalue
## Other choice is original GE
## measure = X0_gs_adjusted[i,2]
measure <- -log10(gene$p.Decay)
#> print(measure)
## If the gene is an old module
#> then we add its affect to neighboring (d<r) nodes
if(g_new_class == "NS"){
## Traverse other genes in the surrounding radius
## and update their cumulative_decay_score
for(j in seq(1, nrow(ge_dist))){
if (j == i){
next
}
## Get distance between the two nodes
g_dist = ge_dist[i,j]
if (g_dist <= g_rd){
## Update cumulative score of gene_j
results_c_decay_score[j] = as.numeric(results_c_decay_score[j]) + measure/g_dist
}
}
}
## If not old module:
#> check if it is a new module,
#> if yes put 1 in the vector
else{
if (g_old_class == "L" || g_old_class == "I"){
new_module[i] = 1
}
}
}
## We have 69 new modules - validate this
## Ghadi: 72 with k = 2, 69 with k = 3
sum(new_module==1)
## [1] 1450
## How many modules are in new but are not in old
sum(class_res_old$gene_class[,2]=="I") + sum(class_res_old$gene_class[,2]=="L")
## [1] 2020
sum(class_res_new$gene_class[,2]=="I") + sum(class_res_new$gene_class[,2]=="L")
## [1] 1795
## Get Module Genes in our method: GS + mND
new_mdg = rownames(class_res_new$gene_class)[class_res_new$gene_class[,2] == "I" | class_res_new$gene_class[,2] == "L"]
## Get Module Genes in 1st method: Only mND
old_mdg = rownames(class_res_old$gene_class)[class_res_old$gene_class[,2] == "I" | class_res_old$gene_class[,2] == "L"]
#> Count Common Module Genes in our Method
sum(new_mdg %in% old_mdg)
## [1] 1476
#> Count Unique Module Genes in both Method
## 32 (k=2) modules discarded by GS
sum(!(old_mdg %in% new_mdg))
## [1] 544
## Ghadi: 72 with k = 2, 69 k = 3
sum(!(new_mdg %in% old_mdg))
## [1] 319
## Get names of unique genes (distinct module genes: mdg)
distinct_mdg = new_mdg[which(!(new_mdg %in% old_mdg))]
discarded_mdg <- new_mdg[which(!(old_mdg %in% new_mdg))]
## Get Indices of unique genes
dmdg_indices = which(rownames(X0_gs_adjusted)%in%distinct_mdg)
discarded_mdg_indices <- which(rownames(X0_gs_adjusted)%in%discarded_mdg)
## Mean Results of Decay Score in 1st method vs our method
mean(results_c_decay_score[dmdg_indices])
## [1] 5325.877
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
## [1] 4907.929
mean(results_c_decay_score[discarded_mdg_indices])
## [1] 5324.014
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
## [1] 4903.844
## Distinct Density Plots
ggplot(as.data.frame(as.matrix(results_c_decay_score[dmdg_indices])), aes(V1)) + geom_density()
ggplot(as.data.frame(as.matrix(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])), aes(V1)) + geom_density()
## Overlaying the Density Plots
#> Distinct
a = data.frame(cumulative_decay_score = results_c_decay_score[dmdg_indices])
b = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
ggplot() +
geom_density(data = a, aes(x = cumulative_decay_score),
fill = "#DAF7A6", color = "black", alpha = 0.7) +
geom_density(data = b, aes(x = y),
fill = "#C70039", color = "black", alpha = 0.7)
#> Discarded
x = data.frame(cumulative_decay_score = results_c_decay_score[discarded_mdg_indices])
y = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
ggplot() +
geom_density(data = x, aes(x = cumulative_decay_score),
fill = "#DAF7A6", color = "black", alpha = 0.7) +
geom_density(data = y, aes(x = y),
fill = "#C70039", color = "black", alpha = 0.7)
## ============================================================================
## Analyzing coverage of Breast Cancer Related Genes - Ghadi
## ============================================================================
## Getting Disease-related genes with AutoSeed
## (includes eDGAR, DrugBank, and MalaCards)
## install.packages("Autoseed")
library(Autoseed)
bc_genes_search <- AutoSeed("breast cancer")
bc_genes_search2 <- AutoSeed("breast carcinoma")
bc_genes <- Reduce(union, c(bc_genes_search$edgar,bc_genes_search$malacards,
bc_genes_search$drugbank,
bc_genes_search2$edgar,bc_genes_search2$malacards,
bc_genes_search2$drugbank))
bc_genes_clean <- unique(vapply(bc_genes, function(x){
return(unlist(strsplit(x, split='::', fixed=TRUE))[1])},
c("x")))
bc_genes_clean <- bc_genes_clean[bc_genes_clean%in%rownames(X0)]
# Matched distinct genes
sum(distinct_mdg %in% bc_genes_clean)
## [1] 111
# Matched discarded genes
sum(discarded_mdg %in% bc_genes_clean)
## [1] 106
# Matched old M genes
sum(old_mdg %in% bc_genes_clean)
## [1] 516
# Matched new M genes
sum(new_mdg %in% bc_genes_clean)
## [1] 478
genes_selected_old <- rownames(class_res_old$gene_class %>% filter(L2 != "NS"))
genes_selected_new <- rownames(class_res_new$gene_class %>% filter(L2 != "NS"))
# Matched selected old
sum(genes_selected_old %in% bc_genes_clean)/length(bc_genes_clean)*100
## [1] 21.75215
# Matched selected new
sum(genes_selected_new %in% bc_genes_clean)/length(bc_genes_clean)*100
## [1] 20.77873
# Analyzing coverage by mNDp cutoffs
cutoffs <- seq(0.0001, 0.06, 0.0001)
genes_selected_cutoffs <- data.frame(mNDp_cutoffs = 1:length(cutoffs),
old = 1:length(cutoffs),
new = 1:length(cutoffs),
new_k2 = 1:length(cutoffs))
for(i in 1:length(cutoffs)){
# %>% filter(L2 != "NS"))
genes_cutoff_old <- rownames(mND_score$mND %>% filter(mNDp <= cutoffs[i]))
# %>% filter(L2 != "NS"))
genes_cutoff_new <- rownames(mND_score_new$mND %>% filter(mNDp <= cutoffs[i]))
genes_cutoff_new_k2 <- rownames(mND_score_new_k2$mND %>% filter(mNDp <= cutoffs[i]))
#print(paste("Cutoff:", i))
# Matched selected by mNDp cutoff old
old <- sum(genes_cutoff_old %in% bc_genes_clean)/length(bc_genes_clean)*100
#print("Old")
#print(old)
# Matched selected by mNDp cutoff new
new <- sum(genes_cutoff_new %in% bc_genes_clean)/length(bc_genes_clean)*100
#print("New (k = 3)")
#print(new)
# Matched selected by mNDp cutoff new k = 2
new_k2 <- sum(genes_cutoff_new_k2 %in% bc_genes_clean)/length(bc_genes_clean)*100
#print("New (k = 2)")
#print(new_k2)
genes_selected_cutoffs[i,] <- c(cutoffs[i], old, new, new_k2)
}
# Interesting how we get more coverage no matter the cutoff although converges
# slightly at xtrmly low cutoffs. k = 2 shows slightly less coverage but at lower
# cutoffs coverage is almost the same (seems that lower k prioritizes important genes)
ggplot() +
geom_line(data = genes_selected_cutoffs, aes(cutoffs, old), color = "black") +
geom_line(data = genes_selected_cutoffs, aes(cutoffs, new), color = "red") +
geom_line(data = genes_selected_cutoffs, aes(cutoffs, new_k2), color = "blue", alpha = 0.5) +
ylab("Cumulative Coverage (%)") +
xlab("mNDp Cutoff") +
ggtitle("Cumulative Coverage of disease-related genes by mNDp cutoffs",
subtitle = "mND (k = 3) in black, GS-adjusted mND (k = 2) in blue, GS-adjusted mND (k = 3) in red")
## ===========================================================================
## Enrichment Analysis - Ghadi
## ===========================================================================
#BiocManager::install("KEGGREST")
library(KEGGREST)
pathways.list <- keggList("pathway", "hsa") #Needs Internet connection
pathway.codes <- sub("path:", "", names(pathways.list))
genes.by.pathway <- sapply(pathway.codes,
function(pwid){
pw <- keggGet(pwid)
if (is.null(pw[[1]]$GENE)) return(NA)
pw2 <- pw[[1]]$GENE[c(FALSE,TRUE)] # may need to modify this to c(FALSE, TRUE) for other organisms
pw2 <- unlist(lapply(strsplit(pw2, split = ";", fixed = T), function(x)x[1]))
return(pw2)
}
)
saveRDS(genes.by.pathway, "Data/genes.by.pathway.rds")
enrich <- function(mND_score, pathways.list, pathway.codes, genes.by.pathway)
{
geneList <- mND_score$mND$mNDp
names(geneList) <- rownames(mND_score$mND)
# Wilcoxon test for each pathway
pVals.by.pathway <- t(sapply(names(genes.by.pathway),
function(pathway) {
pathway.genes <- genes.by.pathway[[pathway]]
list.genes.in.pathway <- intersect(names(geneList), pathway.genes)
list.genes.not.in.pathway <- setdiff(names(geneList), list.genes.in.pathway)
scores.in.pathway <- geneList[list.genes.in.pathway]
scores.not.in.pathway <- geneList[list.genes.not.in.pathway]
if (length(scores.in.pathway) > 0){
p.value <- wilcox.test(scores.in.pathway, scores.not.in.pathway, alternative = "less")$p.value
} else{
p.value <- NA
}
return(c(p.value = p.value, Annotated = length(list.genes.in.pathway)))
}
))
# Assemble output table
outdat <- data.frame(pathway.code = rownames(pVals.by.pathway))
outdat$pathway.name <- pathways.list[paste("path:",outdat$pathway.code, sep = "")]
outdat$p.value <- pVals.by.pathway[,"p.value"]
outdat$Annotated <- pVals.by.pathway[,"Annotated"]
return(outdat[order(outdat$p.value),])
}
enrichment_old <- enrich(mND_score, pathways.list, pathway.codes, genes.by.pathway)
enrichment_new <- enrich(mND_score_new, pathways.list, pathway.codes, genes.by.pathway)
write.csv(enrichment_old, "Results/Enrichment/enrichment_old.csv")
write.csv(enrichment_new, "Results/Enrichment/enrichment_new.csv")
# Write and Load Codes
genes.by.pathway <- readRDS("Data/genes.by.pathway.rds")
enrichment_old <- read.csv("Results/Enrichment/enrichment_old.csv")
enrichment_new <- read.csv("Results/Enrichment/enrichment_new.csv")
comparative_enrich <- function(X0, mND_scores = list(), pathways.list, pathway.codes, genes.by.pathway){
result <- list()
for(i in 1:length(genes.by.pathway)){
pathway_result <- data.frame(matrix(nrow = length(seq (50, 500, 1)),
ncol = length(mND_scores)+2))
colnames(pathway_result) <- append(names(mND_scores), c("DE_scores","cutoff"))
count <- 0
for(mND_score in mND_scores){
count <- count + 1
count2 <- 0
genes <- mND_score$mND %>% arrange(desc(mND))
control_genes <- sort(X0[,2], decreasing = TRUE)
for(cutoff in seq (50, 500, 1)){
count2 <- count2 + 1
pathway_result[count2,count] <- sum(rownames(genes[1:cutoff,]) %in% genes.by.pathway[[i]])/length(genes.by.pathway[[i]])*100
pathway_result[count2,ncol(pathway_result)] <- cutoff
if(count == 1){
pathway_result[count2,ncol(pathway_result)-1] <- sum(names(control_genes[1:cutoff]) %in% genes.by.pathway[[i]])/length(genes.by.pathway[[i]])*100
}
}
}
result[[pathways.list[[i]]]] <- pathway_result
}
return(result)
}
comparative_enrichment <- comparative_enrich(X0, list(mND_old_k3 = mND_score,
mND_new_k3 = mND_score_new,
mND_new_k2 = mND_score_new_k2),
pathways.list, pathway.codes, genes.by.pathway)
bc_pathways <- c("Breast cancer - Homo sapiens (human)",
"Epstein-Barr virus infection - Homo sapiens (human)",
"Pathways in cancer - Homo sapiens (human)",
"Viral carcinogenesis - Homo sapiens (human)")
other_pathways <- c("Cell cycle - Homo sapiens (human)",
"FoxO signaling pathway - Homo sapiens (human)",
"Hippo signaling pathway - Homo sapiens (human)",
"Oocyte meiosis - Homo sapiens (human)",
"p53 signaling pathway - Homo sapiens (human)",
"PI3K-Akt signaling pathway - Homo sapiens (human)",
"Progesterone-mediated oocyte maturation - Homo sapiens (human)",
"Proteasome - Homo sapiens (human)",
"Rap1 signaling pathway - Homo sapiens (human)",
"Ras signaling pathway - Homo sapiens (human)",
"Sphingolipid signaling pathway - Homo sapiens (human)",
"TGF-beta signaling pathway - Homo sapiens (human)")
for(pathway in append(bc_pathways, other_pathways)){
plot <- ggplot(data = comparative_enrichment[[pathway]]) +
geom_line(aes(cutoff, mND_old_k3), color = "black") +
geom_line(aes(cutoff, mND_new_k3), color = "red") +
geom_line(aes(cutoff, mND_new_k2), color = "blue") +
geom_line(aes(cutoff, DE_scores), color = "magenta") +
ylab("Cumulative Coverage (%)") +
xlab("Cutoff of sorted gene list") +
ggtitle(pathway)
#subtitle = "Black: mND (k = 3), Blue: GS-adjusted mND (k = 2),
#Red: GS-adjusted mND (k = 3), Magenta: original DE scores")
print(plot)
}
## =========================================================================
## Visualization of M & L (Looks like neighbors are not included in class M)
## =========================================================================
## mND Graph
library(visNetwork)
genes_subset_mND <- class_res_old$gene_class %>% filter(!(L2 %in% c("NS", "I")))
A_subset_mND <- A[rownames(A) %in% rownames(genes_subset_mND),
colnames(A) %in% rownames(genes_subset_mND)]
graph_mND <- graph_from_adjacency_matrix(A_subset_mND) %>%
set_vertex_attr("class", value = genes_subset_mND[,2])
vis_mND <- toVisNetworkData(graph_mND)
visNetwork(nodes = vis_mND$nodes, edges = vis_mND$edges) %>%
visIgraphLayout(layout = "layout_nicely") %>%
visOptions(nodesIdSelection = TRUE, selectedBy = "class") %>%
visNodes(color = vis_mND$nodes$class)
## GS-adjusted mND graph
genes_subset_adjusted_mND <- class_res_new$gene_class %>% filter(!(L2 %in% c("NS", "I")))
A_subset_adjusted_mND <- A[rownames(A) %in% rownames(genes_subset_adjusted_mND),
colnames(A) %in% rownames(genes_subset_adjusted_mND)]
graph_adjusted_mND <- graph_from_adjacency_matrix(A_subset_adjusted_mND) %>%
set_vertex_attr("class", value = genes_subset_adjusted_mND[,2])
vis_adjusted_mND <- toVisNetworkData(graph_adjusted_mND)
visNetwork(nodes = vis_adjusted_mND$nodes, edges = vis_adjusted_mND$edges) %>%
visIgraphLayout(layout = "layout_nicely") %>%
visOptions(nodesIdSelection = TRUE, selectedBy = "class")
## =========================================================================
assess_connectivity <- function(A, mND_score, n){
objective <- function(S1, A){
return(t(as.matrix(S1))%*%as.matrix(A)%*%as.matrix(S1))
}
permute_A <- function(A, n){
result <- list()
for(i in 1:n){
permuted_A <- A[sample(nrow(A)),]
rownames(permuted_A) <- rownames(A)
result[[i]] <- permuted_A
}
return(result)
}
print("Permuting Vertices...")
permuted_As <- permute_A(A, n)
genes <- mND_score %>% arrange(desc(mND))
results <- data.frame(matrix(ncol = 3, nrow = length(5:nrow(mND_score))))
colnames(results) <- c("cutoff", "omega", "p_omega")
count <- 0
for(cutoff in 5:nrow(mND_score)){
print(paste("Cutoff:", cutoff))
count <- count + 1
results[count,1] <- cutoff
top_genes <- rownames(genes[1:cutoff,])
sub_A <- A[rownames(A) %in% top_genes, colnames(A) %in% top_genes]
print("Calculating Observed Omega...")
observed_omega <- objective(genes[1:cutoff,]$mND, sub_A)
results[count,2] <- observed_omega
print("Calculating Permutated Omegas...")
null_omega <- vector()
for(i in 1:length(permuted_As)){
sub_permuted_A <- permuted_As[[i]][rownames(permuted_As[[i]]) %in% top_genes, colnames(permuted_As[[i]]) %in% top_genes]
null_omega[i] <- objective(genes[1:cutoff,]$mND, sub_permuted_A)
}
print("Assessing Significance...")
p_omega <- sum(as.vector(null_omega) >= as.vector(observed_omega))/length(null_omega)
results[count,3] <- p_omega
print(paste("Done with cutoff", cutoff))
}
return(results)
print("Success!")
}
ordered_scores_old <- (mND_score$mND %>% arrange(desc(mND)))[1:1000,]
sub_A_old <- A[rownames(A) %in% rownames(ordered_scores_old), colnames(A) %in% rownames(ordered_scores_old)]
connectivity_old <- assess_connectivity(sub_A_old, ordered_scores_old, 300)
write.csv(connectivity_old, "Results/Connectivity/connectivity_old.csv")
ordered_scores_new <- (mND_score_new$mND %>% arrange(desc(mND)))[1:1000,]
sub_A_new <- A[rownames(A) %in% rownames(ordered_scores_new), colnames(A) %in% rownames(ordered_scores_new)]
connectivity_new <- assess_connectivity(sub_A_new, ordered_scores_new, 300)
write.csv(connectivity_new, "Results/Connectivity/connectivity_new.csv")
ordered_scores_new_k2 <- (mND_score_new_k2$mND %>% arrange(desc(mND)))[1:1000,]
sub_A_new_k2 <- A[rownames(A) %in% rownames(ordered_scores_new_k2), colnames(A) %in% rownames(ordered_scores_new_k2)]
connectivity_new_k2 <- assess_connectivity(sub_A_new_k2, ordered_scores_new_k2, 300)
write.csv(connectivity_new_k2, "Results/Connectivity/connectivity_new_k2.csv")
ordered_scores_control <- (as.data.frame(X0) %>% mutate(mND = L2) %>% arrange(desc(mND)))[1:1000,]
sub_A_control <- A[rownames(A) %in% rownames(ordered_scores_control), colnames(A) %in% rownames(ordered_scores_control)]
connectivity_control <- assess_connectivity(sub_A_control, ordered_scores_control, 300)
write.csv(connectivity_control, "Results/Connectivity/connectivity_control.csv")
connectivity_old <- read.csv("Results/Connectivity/connectivity_old.csv")
connectivity_new <- read.csv("Results/Connectivity/connectivity_new.csv")
connectivity_new_k2 <- read.csv("Results/Connectivity/connectivity_new_k2.csv")
connectivity_control <- read.csv("Results/Connectivity/connectivity_control.csv")
ggplot() +
geom_line(data = connectivity_old[1:150,], aes(cutoff, p_omega), color = "black") +
geom_line(data = connectivity_new[1:150,], aes(cutoff, p_omega), color = "red") +
geom_line(data = connectivity_new_k2[1:150,], aes(cutoff, p_omega), color = "blue") +
geom_line(data = connectivity_control[1:150,], aes(cutoff, p_omega), color = "magenta") +
ylab("Omega p-value") +
xlab("Cutoff of sorted gene list") +
ggtitle("Significance of enriched connectivity across cutoffs of sorted gene list",
subtitle = "Black: mND (k=3), Blue,Red: GS-adjusted mND (k=2,3), Magenta: original DE scores")
omegas <- connectivity_old[1:996,] %>%
mutate(omega_old = omega) %>%
select(omega_old) %>%
mutate(omega_new = connectivity_new$omega,
omega_new_k2 = connectivity_new_k2$omega,
omega_control = connectivity_control$omega)
ggplot(omegas) +
geom_boxplot(aes("mND (k=3)", log(omega_old)), color = "black") +
geom_boxplot(aes("GS-adjusted (k=3)", log(omega_new)), color = "red") +
geom_boxplot(aes("GS-adjusted (k=2)", log(omega_new_k2)), color = "blue") +
geom_boxplot(aes("original DE", log(omega_control)), color = "magenta") +
ylab("log(Omega)") +
xlab("Score") +
ggtitle("Distribution of enriched connectivity score across cutoffs sorted of genes",
subtitle = "Black: mND (k=3), Blue,Red: GS-adjusted mND (k=2,3), Magenta: original DE")
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
ggplot() +
geom_boxplot(data = mND_score$mND, aes("mND (k=3)", log(mND)), color = "black") +
geom_boxplot(data = mND_score_new$mND, aes("GS-adjusted (k=3)", log(mND)), color = "red") +
geom_boxplot(data = mND_score_new_k2$mND, aes("GS-adjusted (k=2)", log(mND)), color = "blue") +
geom_boxplot(data = data.frame(X0), aes("original DE", log(L2)), color = "magenta") +
ylab("log(Value)") +
xlab("Score") +
ggtitle("Distribution of scores",
subtitle = "Black: mND (k=3), Blue,Red: GS-adjusted mND (k=2,3), Magenta: original DE")
## Warning: Removed 2399 rows containing non-finite values (stat_boxplot).